# import all packages and set plots to be embedded inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
%matplotlib inline
# load in the dataset into a pandas dataframe, print statistics
loans = pd.read_csv('prosperLoanData.csv')
# high-level overview of data shape and composition
loans.shape
loans.dtypes
# frequency of each employment status (quick sanity check on a key category)
loans.EmploymentStatus.value_counts()
# peek at the first ten rows to eyeball the columns
loans.head(10)
# descriptive statistics for numeric variables
loans.describe()
There are 113,937 loans in the dataset with 81 features on each loan. Most variables are numeric in nature, but some, such as CreditGrade, are objects (strings). There are also ordered factor variables such as ProsperScore, which ranges from 1-10, with 10 being the best (lowest-risk) score; it is applicable only to loans originated after July 2009.
I'm most interested in figuring out:
- What affects the borrower’s APR or interest rate?
- Are there differences between loans depending on how large the original loan amount was?
I think analyzing below features will help to answer above questions.
- ProsperScore
- BorrowerState
- EmploymentStatus
- EmploymentStatusDuration
- Occupation
- CurrentCreditLines
- TotalCreditLinespast7years
- OpenCreditLines
- DebtToIncomeRatio
- IncomeRange
- Recommendations
In this section, investigate distributions of individual variables. If you see unusual points or outliers, take a deeper look to clean things up and prepare yourself to look at relationships between variables.
I'll start by looking at the distribution of the main variable of interest: Loan Status.
# Bar chart of loan counts per LoanStatus, ordered by frequency.
status_counts = loans["LoanStatus"].value_counts()
fill_color = sb.color_palette()[0]

plt.figure(figsize=[8, 5])
sb.countplot(data=loans, y='LoanStatus', color=fill_color, order=status_counts.index)
plt.ylabel('Loan Status')
plt.show()

# A couple of categories dominate the counts and hide the rest,
# so redraw the same chart zoomed in on the x-axis.
plt.figure(figsize=[8, 5])
sb.countplot(data=loans, y='LoanStatus', color=fill_color, order=status_counts.index)
plt.ylabel('Loan Status')
plt.xlim(0, 1000)
plt.show()
Well, that is interesting: some categories of Loan Status, such as Cancelled and Past Due (>120 days), have counts of almost zero.
We can say the main categories of Loan Status are "Completed", "Current", and "Chargedoff"; each of the remaining categories has a count below 1,000.
Next up, the first predictor variable of interest: Borrower Rate.
def _plot_borrower_rate_hist(binsize, xlim=None):
    """Histogram of BorrowerRate with the given bin width.

    binsize: width of each histogram bin.
    xlim: optional [lo, hi] x-axis limits to zoom in on a region.
    """
    bins = np.arange(0, loans['BorrowerRate'].max() + binsize, binsize)
    plt.figure(figsize=[8, 5])
    plt.hist(data=loans, x="BorrowerRate", bins=bins)
    if xlim is not None:
        plt.xlim(xlim)
    plt.xlabel('Borrower Rate (APR)')
    plt.show()

# The same histogram was pasted three times with different bin sizes;
# factored into a helper so the bin width / zoom are the only moving parts.
# plotting Borrower Rate on a standard scale
_plot_borrower_rate_hist(0.05, [0, 0.5])
# investigating further on a smaller bin size (full range)
_plot_borrower_rate_hist(0.01)
# zoom in on the spike just above 0.31 with a very fine bin size
_plot_borrower_rate_hist(0.0015, [0.31, 0.32])
At both large and small bin sizes the Borrower Rate seems to follow a normal distribution, so no transformation is needed. Interestingly, there is a sharp peak between 0.316 and 0.317!
Next up, the first predictor variable of interest: Loan Original Amount.
# Loan Original Amount on a linear scale.
amount_max = loans['LoanOriginalAmount'].max()
bin_w = 5000
edges = np.arange(0, amount_max + bin_w, bin_w)
plt.figure(figsize=[8, 5])
plt.hist(data=loans, x='LoanOriginalAmount', bins=edges)
plt.xlim([0, 35500])
plt.xlabel('Loan Original Amount')
plt.show()

# The distribution has a long right tail, so replot on a log10 axis.
# np.log10(loans['LoanOriginalAmount'].describe()) gives min = 3.0 and
# max ~ 4.544, so ticks run from 10**3 = 1000 up to roughly 10**4.5 = 30000.
log_bin_w = 0.125
log_edges = 10 ** np.arange(3.0, np.log10(amount_max) + log_bin_w, log_bin_w)
tick_vals = [1000, 3000, 10000, 30000]
tick_labels = ["1k", "3k", "10k", "30k"]
plt.figure(figsize=[8, 5])
plt.hist(data=loans, x='LoanOriginalAmount', bins=log_edges)
plt.xscale('log')
plt.xticks(tick_vals, tick_labels)
plt.xlabel('Loan Original Amount')
plt.show()
Loan Original Amount has a long-tailed distribution, with a lot of loans on the low "Loan Original Amount" end and few on the high end. When plotted on a log scale, the distribution looks roughly bimodal, with one peak between 3,000 and 4,000 and a second peak just below 10,000.
Next up, the first predictor variable of interest: ProsperScore.
# plotting ProsperScore on a standard scale
# The score ranges from 1-10, with 10 being the best (lowest risk); the data
# also contains 11, so the observed categories are 1..11.
# first the ProsperScore type should be changed to an *ordered* categorical.
loans_ProsperScore_noNaN = loans.copy()
loans_ProsperScore_noNaN = loans_ProsperScore_noNaN[~loans_ProsperScore_noNaN['ProsperScore'].isnull()]
# BUG FIX: the category list previously included 12 and 13, which do not
# occur in the data; trimmed to the documented/observed range 1..11.
ordinal_var_dict = {"ProsperScore": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]}
for var in ordinal_var_dict:
    pd_ver = pd.__version__.split(".")
    if (int(pd_ver[0]) > 0) or (int(pd_ver[1]) >= 21):  # v0.21 or later
        ordered_var = pd.api.types.CategoricalDtype(ordered=True,
                                                    categories=ordinal_var_dict[var])
        # BUG FIX: the ordered dtype was built but never used —
        # astype("category") produced an *unordered* categorical.
        loans_ProsperScore_noNaN[var] = loans_ProsperScore_noNaN[var].astype(ordered_var)
    else:  # pre v0.21
        # BUG FIX: astype was called on the whole frame instead of the column.
        loans_ProsperScore_noNaN[var] = loans_ProsperScore_noNaN[var].astype(
            "category", ordered=True, categories=ordinal_var_dict[var])
# plotting
plt.figure(figsize=[10, 8])
base_color = sb.color_palette()[0]
sb.countplot(data=loans_ProsperScore_noNaN, x='ProsperScore', color=base_color)
plt.xlabel('ProsperScore')
plt.show()
A custom risk score built using historical Prosper data. The score ranges from 1-10, with 10 being the best, or lowest risk score. ProsperScore is normally distributed.
Next up, the first predictor variable of interest: BorrowerState.
# BorrowerState: horizontal bar chart ordered by frequency, then a pie chart
# showing each state's relative share (the counts are computed once and reused).
state_counts = loans["BorrowerState"].value_counts()
bar_color = sb.color_palette()[0]

plt.figure(figsize=[8, 10])
sb.countplot(data=loans, y='BorrowerState', color=bar_color, order=state_counts.index)
plt.ylabel('BorrowerState')
plt.show()

plt.figure(figsize=[10, 10])
plt.pie(state_counts, labels=state_counts.index, startangle=90,
        counterclock=False, autopct='%1.0f%%');
plt.axis("square");
Apparently the number of loan borrowers is much higher in CA (14%, ~15,000) than in any other state. After CA, the numbers of borrowers in GA, IL, FL, NY, and TX range between 5,000 and 7,000 (~5-6%), and all other states have fewer than 5,000 borrowers.
Next up, the first predictor variable of interest: Employment Status.
# Employment Status: frequency-ordered horizontal bar chart.
plt.figure(figsize=[8, 5])
status_order = loans["EmploymentStatus"].value_counts().index
bar_color = sb.color_palette()[0]
sb.countplot(data=loans, y='EmploymentStatus', color=bar_color, order=status_order)
plt.ylabel('Employment Status')
plt.show()
By far the largest group of borrowers are employed, ~70,000. Another ~28,000 of them are full-time. Each of the other categories has fewer than 10,000.
Next up, the first predictor variable of interest: EmploymentStatusDuration.
# EmploymentStatusDuration: drop missing values and zero durations first.
esd_data = loans[~loans['EmploymentStatusDuration'].isnull()].copy()
esd_data = esd_data.query('EmploymentStatusDuration!=0')

# Linear-scale histogram.
bin_w = 20
edges = np.arange(0, esd_data['EmploymentStatusDuration'].max() + bin_w, bin_w)
plt.figure(figsize=[8, 5])
plt.hist(data=esd_data, x='EmploymentStatusDuration', bins=edges)
plt.xlabel('Employment Status Duration')
plt.show()

# Long right tail -> replot on a log10 axis; the data spans roughly
# 10**0 = 1 to 10**3 = 1000 months.
log_bin_w = 0.15
log_edges = 10 ** np.arange(0, 3 + log_bin_w, log_bin_w)
tick_vals = [1, 3, 10, 30, 100, 300, 1000]
tick_labels = ["{}".format(v) for v in tick_vals]
plt.figure(figsize=[8, 5])
plt.hist(data=esd_data, x='EmploymentStatusDuration', bins=log_edges)
plt.xscale('log')
plt.xticks(tick_vals, tick_labels)
plt.xlabel('EmploymentStatusDuration')
plt.show()
EmploymentStatusDuration has a long-tailed distribution, with a lot of loaners on the low EmploymentStatusDuration end, and few on the high EmploymentStatusDuration end. When plotted on a log-scale, the EmploymentStatusDuration distribution looks normally distributed with the highest peak between 50 and 200.
Next up, the first predictor variable of interest: Occupation.
# Occupation: frequency-ordered horizontal bar chart (many categories,
# so the figure is tall).
plt.figure(figsize=[8, 15])
occ_order = loans["Occupation"].value_counts().index
bar_color = sb.color_palette()[0]
sb.countplot(data=loans, y='Occupation', color=bar_color, order=occ_order)
plt.ylabel('Occupation')
plt.show()
In general, students have the fewest loans. The largest number of loans belongs to the "Other" occupation, which is not very informative! After "Other", professionals are in second place with ~15,000, and all other occupations have fewer than 5,000.
Next up, the first predictor variable of interest: CurrentCreditLines.
# CurrentCreditLines: histogram with unit-width bins, NaNs dropped.
ccl_data = loans[~loans['CurrentCreditLines'].isnull()].copy()
bin_w = 1
edges = np.arange(0, ccl_data['CurrentCreditLines'].max() + bin_w, bin_w)
plt.figure(figsize=[8, 5])
plt.hist(data=ccl_data, x='CurrentCreditLines', bins=edges)
plt.xlabel('CurrentCreditLines')
plt.show()
CurrentCreditLines is a little bit skewed to the right. But, it is almost normal distribution.
Next up, the first predictor variable of interest: TotalCreditLinespast7years.
# TotalCreditLinespast7years: histogram after dropping NaNs and zeros.
tcl_data = loans[~loans['TotalCreditLinespast7years'].isnull()].copy()
tcl_data = tcl_data.query('TotalCreditLinespast7years!=0')
bin_w = 2
edges = np.arange(0, tcl_data['TotalCreditLinespast7years'].max() + bin_w, bin_w)
plt.figure(figsize=[8, 5])
plt.hist(data=tcl_data, x='TotalCreditLinespast7years', bins=edges)
plt.xlabel('TotalCreditLinespast7years')
plt.show()
TotalCreditLinespast7years is a little bit skewed to the right. But, it is almost normal distribution.
Next up, the first predictor variable of interest: OpenCreditLines.
# OpenCreditLines: histogram after dropping NaNs and zeros.
# CONSISTENCY FIX: the filtered frame was previously stored in
# loans_TotalCreditLinespast7years_noNaN_noZero (a copy-paste leftover from
# the previous cell) even though it holds OpenCreditLines data; renamed.
loans_OpenCreditLines_noNaN = loans.copy()
loans_OpenCreditLines_noNaN = loans_OpenCreditLines_noNaN[~loans_OpenCreditLines_noNaN['OpenCreditLines'].isnull()]
loans_OpenCreditLines_noNaN_noZero = loans_OpenCreditLines_noNaN.query('OpenCreditLines!=0')
binsize = 2
bins = np.arange(0, loans_OpenCreditLines_noNaN_noZero['OpenCreditLines'].max()+binsize, binsize)
plt.figure(figsize=[8, 5])
plt.hist(data=loans_OpenCreditLines_noNaN_noZero, x="OpenCreditLines", bins=bins)
plt.xlabel('OpenCreditLines')
plt.show()
OpenCreditLines is a little bit skewed to the right. But, it is almost normal distribution.
Next up, the first predictor variable of interest: DebtToIncomeRatio.
# DebtToIncomeRatio: histogram after dropping NaNs and zero ratios.
# The filtered frame names are kept — the outlier cell below reuses them.
loans_DebtToIncomeRatio_noNaN = loans.copy()
present_mask = ~loans_DebtToIncomeRatio_noNaN['DebtToIncomeRatio'].isnull()
loans_DebtToIncomeRatio_noNaN = loans_DebtToIncomeRatio_noNaN[present_mask]
loans_DebtToIncomeRatio_noNaN_noZero = loans_DebtToIncomeRatio_noNaN.query('DebtToIncomeRatio!=0')
bin_w = 0.01
edges = np.arange(0, loans_DebtToIncomeRatio_noNaN_noZero['DebtToIncomeRatio'].max() + bin_w, bin_w)
plt.figure(figsize=[8, 5])
plt.hist(data=loans_DebtToIncomeRatio_noNaN_noZero, x='DebtToIncomeRatio', bins=edges)
plt.xlabel('DebtToIncomeRatio')
plt.show()
The initial plot of the DebtToIncomeRatio show some immediate points of attention. Most of the data is set to the far left of their axes, suggesting some strong outliers on the right. It's worth taking a bit of time to identify these outliers and see if they need to be filtered out of the data.
# select high outliers, using criteria eyeballed from the plots
high_outliers = (loans_DebtToIncomeRatio_noNaN_noZero['DebtToIncomeRatio'] > 1.5)
print(high_outliers.sum())
print(loans_DebtToIncomeRatio_noNaN_noZero.loc[high_outliers,:])
# there are around 555 outliers; drop them and replot.
# BUG FIX: the mask was inverted with unary minus (.loc[-high_outliers,:]);
# negating a boolean Series raises TypeError in modern pandas/numpy —
# boolean masks must be inverted with ~.
loans_DebtToIncomeRatio_noNaN_noZero = loans_DebtToIncomeRatio_noNaN_noZero.loc[~high_outliers,:]
binsize = 0.04
bins = np.arange(0, loans_DebtToIncomeRatio_noNaN_noZero['DebtToIncomeRatio'].max()+binsize, binsize)
plt.figure(figsize=[8, 5])
plt.hist(data=loans_DebtToIncomeRatio_noNaN_noZero, x="DebtToIncomeRatio", bins=bins)
plt.xlabel('DebtToIncomeRatio')
plt.show()
DebtToIncomeRatio has a unimodal distribution with a peak around 0.2.
Next up, the first predictor variable of interest: IncomeRange_order.
loans.IncomeRange.value_counts()
# IncomeRange: bar chart ordered by increasing salary rather than frequency.
plt.figure(figsize=[8, 5])
fill_color = sb.color_palette()[0]
salary_order = ["Not displayed", "Not employed", "$0", "$1-24,999", "$25,000-49,999",
                "$50,000-74,999", "$75,000-99,999", "$100,000+"]
sb.countplot(data=loans, x='IncomeRange', color=fill_color, order=salary_order)
plt.xlabel('IncomeRange')
plt.xticks(rotation=45)
plt.show()
Clearly, most borrowers (~32,000 each) have a salary of "$25,000-49,999" or "$50,000-74,999." At the second level (~15,000 each) are borrowers making "$75,000-99,999" or "$100,000+." The rest are either not employed or make less than $25,000.
Next up, the first predictor variable of interest: Recommendations.
# Recommendations: histogram with unit-width bins.
rec_bin = 1
rec_edges = np.arange(0, loans['Recommendations'].max() + rec_bin, rec_bin)
plt.figure(figsize=[8, 5])
plt.hist(data=loans, x='Recommendations', bins=rec_edges)
plt.xlabel('Recommendations')
plt.show()
# Inspect the outliers: rows with more than 5 recommendations.
outlier_rows = loans.query('Recommendations>5')
print(outlier_rows.Recommendations.sum())
print(outlier_rows.Recommendations)
There are 322 outliers! It is not a big deal I think, I will keep them for now.
I looked at a couple of categorical and numerical variables. It is described below where log tranformation is used.
1. Categorical:
LoanStatus
ProsperScore
BorrowerState
EmploymentStatus
Occupation
IncomeRange
2. Numerical:
2.1. Normal Distribution:
BorrowerRate (unimodal)
CurrentCreditLines (unimodal)
TotalCreditLinespast7years (unimodal)
OpenCreditLines (unimodal)
DebtToIncomeRatio (unimodal, has outliers)
Recommendations (unimodal, has outliers)
2.2. Log-Normal Distribution:
LoanOriginalAmount (bimodal)
EmploymentStatusDuration (unimodal)
When investigating the OpenCreditLines and DebtToIncomeRatio variables, a number of outlier points were identified. Althought, it may be safer to remove these outliers, I kept them for now. However, I wrote the code for removing the outliers which later could be applied on the data.
To start off with, I want to look at the pairwise correlations present between features in the data.
# Convert ProsperScore to an ordered categorical.
# BUG FIX: this cell previously assigned into loans_noNAN, which is only
# created a few cells below — running top-to-bottom raised a NameError, and
# the later loans_noNAN = loans.copy() would have discarded the conversion
# anyway. Apply it to `loans` so every downstream copy inherits it.
# BUG FIX: the ordered CategoricalDtype was built but never applied
# (astype("category") gives an unordered categorical), and categories 12/13
# do not occur in the data (observed range is 1..11).
ordinal_var_dict = {"ProsperScore": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]}
for var in ordinal_var_dict:
    pd_ver = pd.__version__.split(".")
    if (int(pd_ver[0]) > 0) or (int(pd_ver[1]) >= 21):  # v0.21 or later
        ordered_var = pd.api.types.CategoricalDtype(ordered=True,
                                                    categories=ordinal_var_dict[var])
        loans[var] = loans[var].astype(ordered_var)
    else:  # pre v0.21
        loans[var] = loans[var].astype("category", ordered=True,
                                       categories=ordinal_var_dict[var])

# Variables used throughout the rest of the analysis.
numeric_vars = ['BorrowerRate', 'CurrentCreditLines', 'TotalCreditLinespast7years',
                'OpenCreditLines', 'DebtToIncomeRatio',
                'Recommendations', 'LoanOriginalAmount', 'EmploymentStatusDuration']
#categoric_vars = ['LoanStatus', 'ProsperScore', 'EmploymentStatus','BorrowerState','Occupation', 'IncomeRange']
categoric_vars1 = ['LoanStatus', 'ProsperScore', 'EmploymentStatus', 'IncomeRange']

# correlation heatmap of the numeric variables
plt.figure(figsize=[8, 5])
sb.heatmap(loans[numeric_vars].corr(), annot=True, fmt='.3f',
           cmap='vlag_r', center=0)
plt.show()
There is not a good correlation between most features. However, the correlation between:
- OpenCreditLines and CurrentCreditLines is 0.96
- CurrentCreditLines and TotalCreditLinespast7years is 0.628
- OpenCreditLines and TotalCreditLinespast7years is 0.587
Besides, there is a negative relationship between LoanOrgingalAmount and BorrowRate
loans.shape
# Drop rows with NaN in any numeric variable of interest.
loans_noNAN = loans.copy()
loans_noNAN = loans_noNAN.dropna(subset=numeric_vars)
loans_noNAN.shape
# plot matrix: sample 500 loans so that plots are clearer and render faster
samples = np.random.choice(loans_noNAN.shape[0], 500, replace=False)
# BUG FIX: after dropna the index labels are no longer 0..n-1, so positional
# samples must be taken with iloc; .loc with positions could raise KeyError.
loans_samp = loans_noNAN.iloc[samples, :]
sb.set(font_scale=1.5)
# BUG FIX: the grid previously plotted the full frame, silently ignoring the
# 500-row sample that the comment above promises.
g = sb.PairGrid(data=loans_samp, vars=numeric_vars)
g = g.map_diag(plt.hist, bins=20);
g.map_offdiag(plt.scatter);
As expected, most features are poorly correlated except OpenCreditLines, CurrentCreditLines and TotalCreditLinespast7years. The shown negative correlations in heatmap may not need any further analysis.
Let's move on to looking at how numeric_vars correlate with the categorical variables.
# plot matrix of numeric features against categorical features.
# can use a larger sample since there are fewer plots and they're simpler in nature.
samples = np.random.choice(loans_noNAN.shape[0], 2000, replace=False)
# BUG FIX: use iloc for positional sampling (after dropna the index labels no
# longer match positions); also renamed diamonds_samp — a leftover from the
# diamonds template this cell was adapted from.
loans_cat_samp = loans_noNAN.iloc[samples, :]

def boxgrid(x, y, **kwargs):
    """ Quick hack for creating box plots with seaborn's PairGrid. """
    default_color = sb.color_palette()[0]
    sb.boxplot(x, y, color = default_color)
    plt.xticks( rotation= 90 )

plt.figure(figsize = [10, 10])
g = sb.PairGrid(data = loans_cat_samp, y_vars = numeric_vars,
                x_vars = categoric_vars1, size = 3, aspect = 1.5)
g.map(boxgrid);
plt.show();

# BorrowerState and Occupation have many levels, so they get their own
# wide grids.
g = sb.PairGrid(data = loans_cat_samp, y_vars = numeric_vars,
                x_vars = "BorrowerState", size =3, aspect =5)
g.map(boxgrid)
plt.show();

g = sb.PairGrid(data = loans_cat_samp, y_vars = numeric_vars,
                x_vars = "Occupation", size =3, aspect =5)
g.map(boxgrid)
plt.show();
Insights of categorical variables vs. numeric_vars
1. 'LoanStatus':
No clear relationship
2. 'ProsperScore':
negative rationship with BorrowerRate (increasing ProsperScore decreases the BorrowerRate), and positive relationship with LoanOriginalAmount
3. 'EmploymentStatus':
No clear relationship
4. 'IncomeRange':
*Positive relationship with:*
CurrentCreditLines
TotalCreditLinespast7years
OpenCreditLines
LoanOriginalAmount
*Negative relationship with:*
BorrowerRate
5. 'BorrowerState':
No clear relationship
6. 'Occupation':
No clear relationship
Finally, let's look at relationships between the 6 categorical features.
# Clustered bar charts of categorical features against each other:
# five (x, hue, palette) panels stacked vertically on one figure.
plt.figure(figsize = [20, 30])
tick_rot = 90
legend_cols = 10
panels = [
    ('LoanStatus', 'ProsperScore', 'viridis_r'),
    ('LoanStatus', 'EmploymentStatus', 'colorblind'),
    ('ProsperScore', 'EmploymentStatus', 'colorblind'),
    ('ProsperScore', 'IncomeRange', 'colorblind'),
    ('EmploymentStatus', 'IncomeRange', 'colorblind'),
]
for panel_no, (x_var, hue_var, pal) in enumerate(panels, start=1):
    ax = plt.subplot(5, 1, panel_no)
    sb.countplot(data = loans_noNAN, x = x_var, hue = hue_var, palette = pal)
    ax.legend(loc = 1, ncol = legend_cols)  # re-arrange legend to reduce overlapping
    plt.xticks(rotation=tick_rot)
plt.subplots_adjust(hspace=1.2)
plt.show()
Insights on categorical features versus each other:
Subplot 1: Most of those who completed their loans has a ProsperScore above 3. Most of those who currently has loan, has a ProsperScore of 4, and the minimum count of loaners belong to those who has a ProsperScore of 1.
Subplot 2: Among those people who currenly has a loan, ~50,000 of them are employed.
Subplot 3: the emplyed people are the most populated loan borrowers no matter what is their ProsperScore. If their ProsperScore is between 4 and 8, the the population of employed loan borrowers in each particular ProsperScore is between 6000 and 8000.
Subplot 4: In ProsperScore of 2 to 10, the number of loaners who has a salary of 25-49k and 50-74k is more than others and if the ProsperScore is between 4 and 8, the popluation of both salries of 25-49k and 50-74k ranges are above 2500.
Subplot 5: Among those people who are employed, respectively the most popluated salary ranges are: 50-75k, 25-49k, >100k and 75-99k
#categoric_vars = ['LoanStatus', 'ProsperScore', 'EmploymentStatus','BorrowerState','Occupation', 'IncomeRange']
# since there's 15 subplots to create, using the full data should be fine.
#plt.figure(figsize = [20, 60])
#Rotation=90
#ncolno=10
# subplot 1:
#x1=plt.subplot(15, 1, 1)
#sb.countplot(data = loans_noNAN, x = 'LoanStatus', hue = 'ProsperScore', palette = 'viridis_r')
#ax1.legend(loc = 1, ncol = ncolno) # re-arrange legend to remove overlapping
#plt.xticks(rotation=Rotation)
# subplot 2:
#ax2=plt.subplot(15, 1, 2)
#sb.countplot(data = loans_noNAN, x = 'LoanStatus', hue = 'EmploymentStatus', palette = 'deep')
#ax2.legend(loc = 1, ncol = ncolno) # re-arrange legend to reduce overlapping
#plt.xticks(rotation=Rotation)
#subplot 3:
#ax3 = plt.subplot(15, 1, 3)
#sb.countplot(data = loans_noNAN, x = 'LoanStatus', hue = 'BorrowerState', palette = 'pastel')
#ax3.legend(loc = 1, ncol = ncolno) # re-arrange legend to reduce overlapping
#plt.xticks(rotation=Rotation)
#ax4 = plt.subplot(15, 1, 4)
#sb.countplot(data = loans_noNAN, x = 'LoanStatus', hue = 'Occupation', palette = 'dark')
#ax4.legend(loc = 1, ncol = ncolno) # re-arrange legend to reduce overlapping
#plt.xticks(rotation=Rotation)
#ax5 = plt.subplot(15, 1, 5)
#sb.countplot(data = loans_noNAN, x = 'LoanStatus', hue = 'IncomeRange', palette = 'bright')
#ax5.legend(loc = 1, ncol = ncolno) # re-arrange legend to reduce overlapping
#plt.xticks(rotation=Rotation)
#ax6 = plt.subplot(15, 1, 6)
#sb.countplot(data = loans_noNAN, x = 'ProsperScore', hue = 'EmploymentStatus', palette = 'colorblind_r')
#ax6.legend(loc = 1, ncol = ncolno) # re-arrange legend to reduce overlapping
#plt.xticks(rotation=Rotation)
#ax7 = plt.subplot(15, 1, 7)
#sb.countplot(data = loans_noNAN, x = 'ProsperScore', hue = 'BorrowerState', palette = 'Greens')
#ax7.legend(loc = 1, ncol = ncolno) # re-arrange legend to reduce overlapping
#plt.xticks(rotation=Rotation)
#ax8 = plt.subplot(15, 1, 8)
#sb.countplot(data = loans_noNAN, x = 'ProsperScore', hue = 'Occupation', palette = 'Greens')
#ax8.legend(loc = 1, ncol = ncolno) # re-arrange legend to reduce overlapping
#plt.xticks(rotation=Rotation)
#ax9= plt.subplot(15, 1, 9)
#sb.countplot(data = loans_noNAN, x = 'ProsperScore', hue = 'IncomeRange', palette = 'deep')
#ax9.legend(loc = 1, ncol = ncolno) # re-arrange legend to reduce overlapping
#plt.xticks(rotation=Rotation)
#ax10 = plt.subplot(15, 1, 10)
#sb.countplot(data = loans_noNAN, x = 'EmploymentStatus', hue = 'BorrowerState', palette = 'pastel')
#ax10.legend(loc = 1, ncol = ncolno) # re-arrange legend to reduce overlapping
#plt.xticks(rotation=Rotation)
#ax11 = plt.subplot(15, 1, 11)
#sb.countplot(data = loans_noNAN, x = 'EmploymentStatus', hue = 'Occupation', palette = 'Greens')
#ax11.legend(loc = 1, ncol = ncolno) # re-arrange legend to reduce overlapping
#plt.xticks(rotation=Rotation)
#ax12= plt.subplot(15, 1, 12)
#sb.countplot(data = loans_noNAN, x = 'EmploymentStatus', hue = 'IncomeRange', palette = 'pastel_r')
#ax12.legend(loc = 1, ncol = ncolno) # re-arrange legend to reduce overlapping
#plt.xticks(rotation=Rotation)
#ax13 = plt.subplot(15, 1, 13)
#sb.countplot(data = loans_noNAN, x = 'BorrowerState', hue = 'Occupation', palette = 'Greens')
#ax13.legend(loc = 1, ncol = ncolno) # re-arrange legend to reduce overlapping
#plt.xticks(rotation=Rotation)
#ax14 = plt.subplot(15, 1, 14)
#sb.countplot(data = loans_noNAN, x = 'BorrowerState', hue = 'IncomeRange', palette = 'Greens')
#ax14.legend(loc = 1, ncol = ncolno) # re-arrange legend to reduce overlapping
#plt.xticks(rotation=Rotation)
#ax15 = plt.subplot(15, 1, 15)
#sb.countplot(data = loans_noNAN, x = 'Occupation', hue = 'IncomeRange', palette = 'Greens')
#ax15.legend(loc = 1, ncol = ncolno) # re-arrange legend to reduce overlapping
#plt.xticks(rotation=Rotation)
#plt.subplots_adjust(hspace=1)
#plt.show()
With the preliminary look at bivariate relationships out of the way, I want to dig into some of the relationships more.
Scatter:
- BorrowerRate (numeric) vs. LoanOriginalAmount (numeric)
- BorrowerRate (numeric) vs. TotalCreditLinespast7years (numeric)
Violin:
- BorrowerRate (numeric) vs. IncomeRange (categoric_vars)
Cluster bar chart:
- LoanStatus (categoric) vs. EmploymentStatus (categoric)
- LoanStatus (categorical) vs. ProsperScore (categoric)
# scatter plot of BorrowerRate (numeric) vs. LoanOriginalAmount (numeric)
sample = np.random.choice(loans_noNAN.shape[0], 2000, replace=False)
# BUG FIX: `sample` holds positions, not index labels — after dropna the two
# differ, so positional selection must use iloc (loc could raise KeyError).
loans_noNAN_subset = loans_noNAN.iloc[sample]
plt.figure(figsize = [8, 6])
sb.regplot(data = loans_noNAN_subset, x = 'LoanOriginalAmount', y= 'BorrowerRate')
plt.xlabel('LoanOriginalAmount')
plt.ylabel('BorrowerRate')
plt.show()
The graph shows that by increasing LoanOriginalAmount, BorrowerRate decreases which is along with the results of first bivariable graph (HeatMap Graph).
# scatter plot of BorrowerRate (numeric) vs. TotalCreditLinespast7years (numeric)
sample = np.random.choice(loans_noNAN.shape[0], 2000, replace=False)
# BUG FIX: `sample` holds positions, not index labels — after dropna the two
# differ, so positional selection must use iloc (loc could raise KeyError).
loans_noNAN_subset = loans_noNAN.iloc[sample]
plt.figure(figsize = [8, 6])
sb.regplot(data = loans_noNAN_subset, x = 'TotalCreditLinespast7years', y= 'BorrowerRate')
plt.xlabel('TotalCreditLinespast7years')
plt.ylabel('BorrowerRate')
plt.show()
It seems that TotalCreditLinespast7years does not affect the BorrowerRate.
# Violin plot of BorrowerRate (numeric) against IncomeRange (categorical),
# with quartile lines drawn inside each violin.
plt.figure(figsize = [8, 6])
violin_color = sb.color_palette()[0]
sb.violinplot(data = loans_noNAN, x = 'IncomeRange', y= 'BorrowerRate',
              color=violin_color, inner="quartile");
plt.xticks(rotation=45);
The plot of the full data using a violin plot suggests that changing income range does not affect the borrow rate and the mean of all borrow rates in different income ranges is ~0.15-0.21
# Heatmap of counts for EmploymentStatus (rows) vs. LoanStatus (columns).
# BUG FIX: removed a stray plt.figure(figsize=[20, 20]) that opened an empty
# figure before the real [10, 8] one.
ct_counts=loans_noNAN.groupby(["EmploymentStatus", 'LoanStatus']).size().reset_index(name="count")
ct_counts=ct_counts.pivot(index="EmploymentStatus", columns="LoanStatus", values="count")
# pivot leaves NaN for unobserved combinations; cast to float for the heatmap
ct_counts = ct_counts[ct_counts.columns].astype(float)
plt.figure(figsize = [10,8])
sb.heatmap(ct_counts,annot=True,fmt='.0f',cmap="mako_r");
The graph above shows that the number of people with Employed or Full-time status who currently have a loan or have completed their loans is much higher than for any other employment status.
# Heatmap of counts for ProsperScore (rows) vs. LoanStatus (columns).
score_status = loans_noNAN.groupby(["ProsperScore", 'LoanStatus']).size().reset_index(name="count")
score_status = score_status.pivot(index="ProsperScore", columns="LoanStatus", values="count")
# fill unobserved combinations, then cast everything to float for the heatmap
score_status.fillna(value="0", inplace=True)
score_status = score_status[score_status.columns].astype(float)
plt.figure(figsize = [10,8])
sb.heatmap(score_status, cmap="mako_r");
Above graph shows that 1. in this data most people currenly have a loan or completed their loans and their interest rate is ~4-8.
Most of those who completed their loans has a ProsperScore above 3. Most of those who currently has loan, has a ProsperScore of 4, and the minimum count of loaners belong to those who has a ProsperScore of 1. The majority (~50,000) of people who currenly has a loan are emplyed no matter what is their ProsperScore. If their ProsperScore is between 4 and 8, the population of employed loan borrowers in each particular ProsperScore is between 6000 and 8000. In ProsperScore of 2 to 8, the number of loaners who has a salary of 25-49k and 50-74k is more than others salary groups and if the ProsperScore is between 4 and 8, the popluation of both salries of 25-49k and 50-74k groups ranges between 2500 and 3900. Plots also show that BorrowerRate has a negative relationship with LoanOriginalAmount and has a postive relationship with ProsperScore. IncomeRange also affect the LoanOriginalAmount in a positive way. The plot of the full data using a violin plot suggests that borrow rate is independant of income range and ranges ~0.15-0.21
There is a positive relationshi between ProsperScore and LoanOriginalAmount. The number of people with emplyed and full time status who currently have a loan or completed their laons is much higher than other people with other employment status. In this data, most people currenly have a loan or completed their loans, and their interest rates is ~4-8.
The main thing I want to explore in this part of the analysis is how the three categorical measures of TotalCreditLinespast7years and IncomeRange play into the relationship between BorrowerRate and LoanOriginalAmount.
#np.random.seed(2018)  # uncomment for reproducible sampling
sample=np.random.choice(loans_noNAN.shape[0],1000, replace=False)
# BUG FIX: `sample` holds positions, not index labels — after dropna the two
# differ, so positional selection must use iloc (loc could raise KeyError).
loans_noNAN_subset=loans_noNAN.iloc[sample]
plt.figure(figsize = [12,12])

# Panel 1: encode TotalCreditLinespast7years as marker size.
plt.subplot(2,1,1)
ax1=sb.regplot(data=loans_noNAN_subset,x="LoanOriginalAmount",y="BorrowerRate",
               x_jitter=0.04, fit_reg=False,
               scatter_kws={'s':loans_noNAN_subset["TotalCreditLinespast7years"]*4})
plt.xlabel("LoanOriginalAmount")
plt.ylabel("BorrowerRate")
# build a proxy legend mapping dot sizes back to credit-line counts
sizes=[10, 20, 35,50, 75]
base_color=sb.color_palette()[0]
legend_obj=[]
for s in sizes:
    legend_obj.append(plt.scatter([],[],s=s*4,color=base_color))
plt.legend(legend_obj,sizes,title="TotalCreditLinespast7years");

# Panel 2: encode the same variable as color instead of size.
plt.subplot(2,1,2)
ax2=plt.scatter(data=loans_noNAN_subset,x="LoanOriginalAmount",y="BorrowerRate",
                c="TotalCreditLinespast7years", cmap="viridis_r")
plt.xlabel("LoanOriginalAmount")
plt.ylabel("BorrowerRate")
plt.colorbar(label="TotalCreditLinespast7years",orientation="horizontal");
I tried using both size and color to find the effect of TotalCreditLinespast7years in a scatter plot of BorrowerRate vs. LoanOriginalAmount. As shown, TotalCreditLinespast7years does not play a significant role in controlling the BorrowerRate and LoanOriginalAmount relationship.
# np.random.seed(18)  # uncomment for a reproducible sample
# Sample 1000 loans; .iloc because `sample` holds integer positions, not
# index labels (the NaN-filtered frame's index is not contiguous).
sample = np.random.choice(loans_noNAN.shape[0], 1000, replace=False)
loans_noNAN_subset = loans_noNAN.iloc[sample]

# Order income bands from highest to lowest so the legend reads top-down.
IncomeRange_order = ["$100,000+", "$75,000-99,999", "$50,000-74,999",
                     "$25,000-49,999", "$1-24,999", "Not displayed"]
# One scatter with a color (hue) per income band.
g = sb.FacetGrid(data=loans_noNAN_subset, hue="IncomeRange", hue_order=IncomeRange_order,
                 size=8, aspect=1.5, palette="colorblind")
g = g.map(sb.regplot, "LoanOriginalAmount", "BorrowerRate", x_jitter=0.1, fit_reg=False);
g.add_legend()
plt.xlabel("LoanOriginalAmount")
plt.ylabel("BorrowerRate");
I used color to investigate the effect of IncomeRange on the relationship between BorrowerRate and LoanOriginalAmount. It seems that with increasing LoanOriginalAmount, both BorrowerRate and IncomeRange decrease. Besides, apparently only those who have an IncomeRange above \$100k are borrowing higher LoanOriginalAmount (>\$25000), and their BorrowerRate is <0.15. Note that there are other examples which do not follow the same trend. For example, you may find people with an IncomeRange above $100k who have a lower LoanOriginalAmount and a higher BorrowerRate.
Now, I am curious to see how the BorrowerRate and LoanOriginalAmount change in each IncomeRange category.
# Larger sample (10,000) since the points are now spread across six facets.
# .iloc because `sample` holds integer positions, not index labels.
sample = np.random.choice(loans_noNAN.shape[0], 10000, replace=False)
loans_noNAN_subset = loans_noNAN.iloc[sample]

# One facet (column) per income band, highest income first, wrapped 2-wide.
IncomeRange_order = ["$100,000+", "$75,000-99,999", "$50,000-74,999",
                     "$25,000-49,999", "$1-24,999", "Not displayed"]
g = sb.FacetGrid(data=loans_noNAN_subset, col="IncomeRange", col_order=IncomeRange_order,
                 size=4, aspect=1.5, palette="colorblind", col_wrap=2)
# fit_reg=True overlays a regression line per facet to show the trend.
g = g.map(sb.regplot, "LoanOriginalAmount", "BorrowerRate", x_jitter=0.1, fit_reg=True);
g.add_legend()
plt.xlabel("LoanOriginalAmount")
plt.ylim(0, 0.40)  # facets share the y axis, so this clips all of them
plt.ylabel("BorrowerRate");
In general, it could be concluded that, at each particular IncomeRange, increasing the LoanOriginalAmount decreases the BorrowerRate.
Let's take a look at the effect of ProsperScore on the relationship between BorrowerRate and LoanOriginalAmount.
# Sample 1000 loans; .iloc because `sample` holds integer positions, not
# index labels (the NaN-filtered frame's index is not contiguous).
sample = np.random.choice(loans_noNAN.shape[0], 1000, replace=False)
loans_noNAN_subset = loans_noNAN.iloc[sample]
# Keep only rows with a meaningful ProsperScore.
loans_noNAN_subset = loans_noNAN_subset.query('ProsperScore!=0')
loans_noNAN_subset = loans_noNAN_subset[~loans_noNAN_subset['ProsperScore'].isnull()]

# One distinct matplotlib marker per score level (the data uses 1-11).
score_markers = [["1", "o"], ["2", "x"], ["3", "s"], ["4", "d"], ["5", ">"], ["6", "<"],
                 ["7", "^"], ["8", "v"], ["9", "*"], ["10", "1"], ["11", "_"]]
fig = plt.figure(figsize=[15, 15])
# Collect the labels actually drawn: a hard-coded legend list would silently
# mislabel markers whenever a score level is absent from the random sample.
plotted_labels = []
for i, marker in score_markers:
    score = float(i)
    data = loans_noNAN_subset.loc[loans_noNAN_subset["ProsperScore"] == score]
    if data.empty:
        continue  # nothing to draw for this score in this sample
    sb.regplot(data=data, x="LoanOriginalAmount", y="BorrowerRate",
               x_jitter=0.1, fit_reg=False, marker=marker, scatter_kws={'s': 150})
    plotted_labels.append(i)
plt.xlabel("LoanOriginalAmount")
plt.ylabel("BorrowerRate");
plt.legend(plotted_labels);
It looks somewhat clumsy to me. Let's see how this graph looks using a FacetGrid. In each plot, I will show one ProsperScore.
# Large sample (50,000) since the points are split across ~11 facets.
# .iloc because `sample` holds integer positions, not index labels.
sample = np.random.choice(loans_noNAN.shape[0], 50000, replace=False)
loans_noNAN_subset = loans_noNAN.iloc[sample]
# IncomeRange_order=[ "$100,000+","$75,000-99,999","$50,000-74,999","$25,000-49,999","$1-24,999","Not displayed" ]
# col_order=IncomeRange_order,
# One facet per ProsperScore level, wrapped 2-wide, each with its own fit line.
g = sb.FacetGrid(data=loans_noNAN_subset, col="ProsperScore",
                 size=8, aspect=1.5, palette="colorblind", col_wrap=2)
g = g.map(sb.regplot, "LoanOriginalAmount", "BorrowerRate", x_jitter=0.1, fit_reg=True);
g.add_legend()
plt.xlabel("LoanOriginalAmount")
plt.ylabel("BorrowerRate");
In general, at lower ProsperScore (<8), increasing LoanOriginalAmount decreases the BorrowerRate. And at higher ProsperScore (>9), the BorrowerRate remains constant at a low value of ~0.1
Now let's see how IncomeRange affects the ProsperScore and BorrowerRate relationship.
# Point plot of mean BorrowerRate (with SD error bars) per ProsperScore,
# one dodged point cluster per income band, on the FULL cleaned dataset.
fig = plt.figure(figsize=[8, 6])
IncomeRange_order = ["$100,000+", "$75,000-99,999", "$50,000-74,999",
                     "$25,000-49,999", "$1-24,999", "Not displayed"]
ax = sb.pointplot(data=loans_noNAN, x='ProsperScore', y='BorrowerRate',
                  hue='IncomeRange', hue_order=IncomeRange_order,
                  palette='dark', linestyles='', dodge=0.4, ci="sd")
ax.set_title('ProsperScore vs. BorrowerRate at different IncomeRange')
ax.set_xlabel('ProsperScore')
ax.set_ylabel('BorrowerRate')
plt.show();
In general, increasing ProsperScore decreases BorrowerRate. Apparently, people with a lower IncomeRange have a higher BorrowerRate at each individual ProsperScore.
TotalCreditLinespast7years does not play a significant role in controlling the BorrowerRate and LoanOriginalAmount relationship.
At each particular IncomeRange, increasing the LoanOriginalAmount decreases the BorrowerRate. Besides, apparently only those who have an IncomeRange above \$100k are borrowing higher LoanOriginalAmount (\>$25000), and their BorrowerRate is <0.15.
Apparently, people with a lower IncomeRange have a higher BorrowerRate at each individual ProsperScore.